changes test test …… . . . . not sure if this will work!
here is a new line for the internet
Load packages
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
source("functions.R")
I downloaded the file into R
# Fetch the Gapminder five-year data and read it into a data frame.
# download.file() cannot create the destination directory, so make sure
# data/ exists first; the download is skipped when a local copy is already
# present so the script can be re-run without network access.
dir.create("data", showWarnings = FALSE)
if (!file.exists("data/gapminder-FiveYearData.csv")) {
  download.file(
    "https://raw.githubusercontent.com/swcarpentry/r-novice-gapminder/gh-pages/_episodes_rmd/data/gapminder-FiveYearData.csv",
    destfile = "data/gapminder-FiveYearData.csv"
  )
}
gapminder <- read.csv("data/gapminder-FiveYearData.csv")
# Show the first rows as a quick check of the import.
head(gapminder)
## country year pop continent lifeExp gdpPercap
## 1 Afghanistan 1952 8425333 Asia 28.801 779.4453
## 2 Afghanistan 1957 9240934 Asia 30.332 820.8530
## 3 Afghanistan 1962 10267083 Asia 31.997 853.1007
## 4 Afghanistan 1967 11537966 Asia 34.020 836.1971
## 5 Afghanistan 1972 13079460 Asia 36.088 739.9811
## 6 Afghanistan 1977 14880372 Asia 38.438 786.1134
I wonder how life expectancy has changed over the years
# Scatter plot of life expectancy against year, one point per row of gapminder.
ggplot(gapminder, aes(x = year, y = lifeExp)) +
  geom_point()
Interactive version
# Install plotly when it cannot be loaded: require() returns FALSE for a
# missing package instead of raising an error.
if (!require("plotly")) {
  install.packages("plotly")
}
## Loading required package: plotly
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(plotly)
# One panel per country in the Americas: life expectancy against GDP per
# capita on a log10 x-axis, with a linear-model fit on top of the points.
p <- ggplot(
  data = gapminder[gapminder$continent == "Americas", ],
  mapping = aes(x = gdpPercap, y = lifeExp, color = continent, by = country)
) +
  geom_point() +
  scale_x_log10() +
  geom_smooth(method = "lm") +
  facet_wrap(~country) +
  labs(x = "Gross Domestic Product", y = "Life Expectancy")
# Convert the static ggplot object into an interactive plotly figure.
ggplotly(p)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
If you are repeating yourself in your code, you may be able to solve that problem by making your own function!
# Small numeric sample used to try out se().
# NOTE(review): se() is not defined in this file; presumably it comes from
# functions.R, which is sourced near the top -- confirm.
cars <- c(3, 4, 5, 6, 7, 10)
se(cars)
## [1] 1.013794
dplyr
You will likely want to get subsections of your data frame and/or calculate means of a variable for a certain subsection; dplyr is your friend!
Explore select
# Two ways to end up with the same three columns.
# 1) Name the columns to keep.
year_country_gdp <- select(gapminder, year, country, gdpPercap)
# 2) Drop the columns that are not wanted. This assignment replaces the
#    result above, so the remaining columns keep gapminder's own order.
year_country_gdp <- select(gapminder, -pop, -continent, -lifeExp)
names(year_country_gdp)
## [1] "country" "year" "gdpPercap"
Explore filter
# European rows only, reduced to year, country and GDP per capita.
# Piped version: each step feeds the next.
year_country_gdp_euro <- gapminder %>%
  filter(continent == "Europe") %>%
  select(year, country, gdpPercap)

# Same result without the pipe, using an intermediate data frame.
euro <- filter(gapminder, continent == "Europe")
year_country_gdp_euro <- select(euro, year, country, gdpPercap)
Exploring the amazing group_by and summarize functions
# Per-country summary of GDP per capita: the mean and the value returned by
# se() for each country's rows.
mean_gdp_percountry <- gapminder %>%
  group_by(country) %>%
  summarise(
    mean_gdp = mean(gdpPercap),
    se_gdp = se(gdpPercap)
  )
mean_gdp_percountry
## # A tibble: 142 × 3
## country mean_gdp se_gdp
## <fctr> <dbl> <dbl>
## 1 Afghanistan 802.6746 31.23550
## 2 Albania 3255.3666 344.20223
## 3 Algeria 4426.0260 378.26190
## 4 Angola 3607.1005 336.56641
## 5 Argentina 8955.5538 537.68144
## 6 Australia 19980.5956 2256.11315
## 7 Austria 20411.9163 2787.23968
## 8 Bahrain 18077.6639 1563.29518
## 9 Bangladesh 817.5588 67.86165
## 10 Belgium 19900.7581 2422.32683
## # ... with 132 more rows
# Life-expectancy summary for every continent/country pair: the mean, the
# value returned by se(), and the number of rows that went into each group.
mean_lifeExp_percontinent <- gapminder %>%
  group_by(continent, country) %>%
  summarise(
    mean_lifeExp = mean(lifeExp),
    se_lifeExp = se(lifeExp),
    length_lifeExp = n()
  )
mean_lifeExp_percontinent
## Source: local data frame [142 x 5]
## Groups: continent [?]
##
## continent country mean_lifeExp se_lifeExp
## <fctr> <fctr> <dbl> <dbl>
## 1 Africa Algeria 59.03017 2.9849208
## 2 Africa Angola 37.88350 1.1562236
## 3 Africa Benin 48.77992 1.7691977
## 4 Africa Botswana 54.59750 1.7116922
## 5 Africa Burkina Faso 44.69400 1.9762099
## 6 Africa Burundi 44.81733 0.9165096
## 7 Africa Cameroon 48.12850 1.5784640
## 8 Africa Central African Republic 43.86692 1.3627459
## 9 Africa Chad 46.77358 1.4110376
## 10 Africa Comoros 52.38175 2.3476081
## # ... with 132 more rows, and 1 more variables: length_lifeExp <int>
Combining ggplot and dplyr
# Life expectancy over time for each European country, one panel per country.
# The dplyr pipeline feeds the filtered rows straight into ggplot().
euro_countries <- gapminder %>%
  filter(continent == "Europe") %>%
  ggplot(aes(x = year, y = lifeExp, color = country)) +
  geom_line() +
  facet_wrap(~country)
euro_countries
# Pass the plot explicitly: ggsave() otherwise saves whichever plot was
# displayed last, which is only euro_countries if the line above was printed.
ggsave("euro.png", plot = euro_countries)
## Saving 7 x 5 in image
# Save the per-country GDP summary. write.csv() cannot create missing
# directories, so make sure processed/ exists before writing.
dir.create("processed", showWarnings = FALSE, recursive = TRUE)
write.csv(mean_gdp_percountry, "processed/mean_gdp_percountry.csv")
tidyr
R likes to have ‘long’ format data, where every row is an observation: a single column holds the observed values and the other columns serve to identify each observation (exceptions apply when you have multiple types of observations). To switch back and forth between ‘wide’ (how we typically enter data in a spreadsheet) and ‘long’, use tidyr.
# Download the 'wide' version of the data and read it in.
# download.file() cannot create the destination directory, so make sure
# data/ exists first; the download is skipped when a local copy is already
# present so the script can be re-run without network access.
dir.create("data", showWarnings = FALSE)
if (!file.exists("data/gapminder_wide.csv")) {
  download.file(
    "https://raw.githubusercontent.com/swcarpentry/r-novice-gapminder/gh-pages/data/gapminder_wide.csv",
    destfile = "data/gapminder_wide.csv"
  )
}
gapminder_wide <- read.csv("data/gapminder_wide.csv")
# Wide -> long: stack the measurement columns into a key column
# (obstype_year) and a value column (obs_values).
# Variant 1: pick the columns to gather by their name prefix.
gap_long <- gapminder_wide %>%
  gather(
    obstype_year, obs_values,
    starts_with("pop"), starts_with("lifeExp"), starts_with("gdpPercap")
  )

# Variant 2: pick the columns to gather by position (columns 3 to 38).
# This assignment replaces the result of variant 1.
gap_long <- gapminder_wide %>%
  gather(obstype_year, obs_values, 3:38)
Separate the obstype_year column into obs_type and year
# Split obstype_year at the underscore into obs_type and year, then spread
# the obs_type values back out so each observation type has its own column.
gap_normal <- gap_long %>%
  separate(obstype_year, into = c("obs_type", "year"), sep = "_") %>%
  spread(obs_type, obs_values)
head(gap_normal)
## continent country year gdpPercap lifeExp pop
## 1 Africa Algeria 1952 2449.008 43.077 9279525
## 2 Africa Algeria 1957 3013.976 45.685 10270856
## 3 Africa Algeria 1962 2550.817 48.303 11000948
## 4 Africa Algeria 1967 3246.992 51.407 12760499
## 5 Africa Algeria 1972 4182.664 54.518 14760787
## 6 Africa Algeria 1977 4910.417 58.014 17152804
# Sort the reshaped data by country, continent and year, then compare it
# with the original gapminder data frame; all.equal() lists any differences.
gap_normal <- gap_normal %>%
  arrange(country, continent, year)
all.equal(gapminder, gap_normal)
## [1] "Names: 5 string mismatches"
## [2] "Component 1: Attributes: < Component \"levels\": Lengths (142, 5) differ (string compare on first 5) >"
## [3] "Component 1: Attributes: < Component \"levels\": 5 string mismatches >"
## [4] "Component 1: 1704 string mismatches"
## [5] "Component 2: Attributes: < target is NULL, current is list >"
## [6] "Component 2: target is numeric, current is factor"
## [7] "Component 3: Modes: numeric, character"
## [8] "Component 3: target is numeric, current is character"
## [9] "Component 4: 'current' is not a factor"
## [10] "Component 6: Mean relative difference: 4101.546"